Librerías
import plotly.express as px
import pandas as pd
import pickle
import numpy as np
from os import listdir
import matplotlib.pyplot as plt
Jupyter notebooks
# Root directory with the pickled clustering artifacts for the Spanish (ES) corpus.
root_file = "/github/data-mining-2022/Hitos/H3/notebooks_clustering"
# NOTE(review): the substring tests ("es" in f, "dm" in f) may over-match file
# names that merely contain those letters; a stricter "_es_" check would be
# safer — confirm against the actual naming scheme.
es_array_paths = [
    f
    for f in listdir(root_file)
    if ("es" in f) and ("pickle" in f) and ("dm" in f or "report" in f or "projection" in f)
]
path = "../../Data/train/df_es_train.pickle"
# Context manager closes the handle deterministically (the original
# pickle.load(open(...)) leaked it).
with open(path, "rb") as fh:
    df_es_train = pickle.load(fh)
# Reproducible 20k-row sample; numpy's choice samples WITH replacement by default.
sample_index_es = np.random.RandomState(0).choice(range(df_es_train.shape[0]), 20000)
# Ground-truth labels for the sampled rows (not used in the visible code below).
labels = df_es_train.iloc[sample_index_es]["label"].values
# Collect every clustering metric reported for the ES corpus into a tidy frame
# with one row per (vectorizer, clustering method, metric).
es_results = []
for f in es_array_paths:
    if "report" in f:
        # Assumed file-name pattern: "<vectorizer>_<clustering>_es_report.pickle".
        se, cm = f.replace("_es_report.pickle", "").split("_")
        # Context manager closes the handle (original open(...) leaked it).
        with open(root_file + "/" + f, "rb") as fh:
            dict_m = pickle.load(fh)
        for k, v in dict_m["metricas"].items():
            es_results.append({"Vector": se, "Clustering": cm, "Metrica": k, "Valor": v})
df_es_results = pd.DataFrame(es_results)
df_es_results
| Vector | Clustering | Metrica | Valor | |
|---|---|---|---|---|
| 0 | beto | dbscan | Silhouette | -0.053405 |
| 1 | beto | dbscan | Rand score | 0.092397 |
| 2 | beto | dbscan | Mutual information | 0.000726 |
| 3 | beto | dbscan | Homogeneity | 0.000365 |
| 4 | beto | dbscan | Completeness | 0.100454 |
| ... | ... | ... | ... | ... |
| 91 | w2v | kmeans | Rand score | 0.819727 |
| 92 | w2v | kmeans | Mutual information | 0.026681 |
| 93 | w2v | kmeans | Homogeneity | 0.025777 |
| 94 | w2v | kmeans | Completeness | 0.027650 |
| 95 | w2v | kmeans | V-measure | 0.026681 |
96 rows × 4 columns
def plot_metric(df, metrica, lang="ES"):
    """Show a grouped bar chart of one clustering metric, split by vectorizer."""
    subset = df[df["Metrica"] == metrica]
    fig = px.bar(
        subset,
        x="Clustering",
        y="Valor",
        color="Vector",
        width=600,
        height=400,
    )
    fig.update_layout(title=f"{metrica}: {lang}", barmode='group')
    fig.show(renderer="notebook")
# Render each reported metric for the ES corpus, in the same order as before.
for _metric in ("Silhouette", "Rand score", "Mutual information",
                "Homogeneity", "Completeness", "V-measure"):
    plot_metric(df_es_results, metrica=_metric)
# Load the pickled distance-matrix reports for the ES corpus.
es_dm = []
for f in es_array_paths:
    if "dm" in f:
        # "<vectorizer>_<clustering>_es_dm.pickle" -> (vectorizer, clustering).
        se, cm = f.replace("_es_dm.pickle", "").split("_")
        # Context manager closes the handle (original open(...) leaked it).
        with open(root_file + "/" + f, "rb") as fh:
            dict_m = pickle.load(fh)
        es_dm.append([(se, cm), dict_m])
# 4x4 grid of distance-matrix heatmaps for ES: columns = vectorizers,
# rows = clustering methods.
fig, ax = plt.subplots(4, 4, figsize=(8, 8))
q1 = 0.05  # lower color-scale quantile
q3 = 0.95  # upper color-scale quantile (the original had a dead `if j==2`
           # branch that assigned 0.95 on both arms; collapsed here)
for v0, v1 in es_dm:
    # Map (vectorizer, clusterer) names to grid coordinates.
    j = ["tfidf", "bow", "w2v", "beto"].index(v0[0])
    i = ["kmeans", "gm", "hc", "dbscan"].index(v0[1])
    dm = v1["matriz_distancias"]
    # Clip the color scale to the [q1, q3] quantiles so extreme distances
    # do not wash out the structure.
    vmin = np.quantile(dm.flatten(), q1)
    vmax = np.quantile(dm.flatten(), q3)
    ax[i, j].imshow(dm, cmap="jet", vmin=vmin, vmax=vmax)
    ax[i, j].set_yticks([])
    ax[i, j].set_xticks([])
    if i == 0:
        ax[i, j].set_title(v0[0], size=18)
    if j == 0:
        ax[i, j].set_ylabel(v0[1], size=18)
plt.suptitle("Matriz de distancias: ES", size=20);
fig.savefig('MD_ES.jpg', bbox_inches='tight')
es_prjt = {"pca": {}, "tsne": {}, "umap": {}}
for f in es_array_paths:
if "projection" in f:
se, rd = f.split("_")[:2]
reduced = pickle.load(open(root_file+"/"+f, "rb"))["reduced"]
es_prjt[rd][se] = reduced
es_clusters = {"kmeans": {}, "hc": {}, "gm": {}, "dbscan": {}}
for f in es_array_paths:
if "report" in f:
se, cm = f.replace("_es_report.pickle", "").split("_")
dict_m = pickle.load(open(root_file+"/"+f, "rb"))
es_clusters[cm][se] = dict_m["clusters"]
plot_sample_index_es = np.random.RandomState(0).choice(range(20000), 1000)
def plot_prj(prj):
    """Scatter the 2-D `prj` projection for every (clusterer, vectorizer)
    pair on a 4x4 grid, colored by cluster assignment (ES corpus), and
    save the figure as '<PRJ>_ES.jpg'."""
    point_alpha = 0.5
    point_size = 7
    fig, ax = plt.subplots(4, 4, figsize=(10, 10))
    for i, cm in enumerate(["kmeans", "gm", "hc", "dbscan"]):
        for j, se in enumerate(["tfidf", "bow", "w2v", "beto"]):
            clusters = es_clusters[cm][se][plot_sample_index_es]
            coords = es_prjt[prj][se]
            x = coords[plot_sample_index_es, 0]
            y = coords[plot_sample_index_es, 1]
            # Draw larger clusters first so smaller ones stay visible on top.
            sorted_clusters = pd.Series(clusters).value_counts().sort_values(ascending=False).index
            for cluster in sorted_clusters:
                mask = (clusters == cluster)
                ax[i, j].scatter(x[mask], y[mask], alpha=point_alpha, s=point_size)
            ax[i, j].set_yticks([])
            ax[i, j].set_xticks([])
            if i == 0:
                ax[i, j].set_title(se, size=18)
            if j == 0:
                ax[i, j].set_ylabel(cm, size=18)
    plt.suptitle(f"{prj.upper()}: ES", size=20);
    fig.savefig(f'{prj.upper()}_ES.jpg', bbox_inches='tight')
plot_prj("umap")
plot_prj("tsne")
plot_prj("pca")
# Root directory with the pickled clustering artifacts for the English (US) corpus.
root_file = "/github/data-mining-2022/Hitos/H3/notebooks_clustering"
# NOTE(review): the substring tests ("us" in f, "dm" in f) may over-match file
# names that merely contain those letters; a stricter "_us_" check would be
# safer — confirm against the actual naming scheme.
us_array_paths = [
    f
    for f in listdir(root_file)
    if ("us" in f) and ("pickle" in f) and ("dm" in f or "report" in f or "projection" in f)
]
path = "../../Data/train/df_us_train.pickle"
# Context manager closes the handle deterministically (the original
# pickle.load(open(...)) leaked it).
with open(path, "rb") as fh:
    df_us_train = pickle.load(fh)
# Reproducible 20k-row sample; numpy's choice samples WITH replacement by default.
sample_index_us = np.random.RandomState(0).choice(range(df_us_train.shape[0]), 20000)
# Ground-truth labels for the sampled rows (rebinds the ES `labels` variable;
# not used in the visible code below).
labels = df_us_train.iloc[sample_index_us]["label"].values
# Collect every clustering metric reported for the US corpus into a tidy frame
# with one row per (vectorizer, clustering method, metric).
us_results = []
for f in us_array_paths:
    if "report" in f:
        # Assumed file-name pattern: "<vectorizer>_<clustering>_us_report.pickle".
        se, cm = f.replace("_us_report.pickle", "").split("_")
        # Context manager closes the handle (original open(...) leaked it).
        with open(root_file + "/" + f, "rb") as fh:
            dict_m = pickle.load(fh)
        for k, v in dict_m["metricas"].items():
            us_results.append({"Vector": se, "Clustering": cm, "Metrica": k, "Valor": v})
df_us_results = pd.DataFrame(us_results)
df_us_results
| Vector | Clustering | Metrica | Valor | |
|---|---|---|---|---|
| 0 | bertweet | dbscan | Silhouette | -0.196180 |
| 1 | bertweet | dbscan | Rand score | 0.219949 |
| 2 | bertweet | dbscan | Mutual information | 0.003311 |
| 3 | bertweet | dbscan | Homogeneity | 0.001925 |
| 4 | bertweet | dbscan | Completeness | 0.011831 |
| ... | ... | ... | ... | ... |
| 91 | w2v | kmeans | Rand score | 0.865099 |
| 92 | w2v | kmeans | Mutual information | 0.062371 |
| 93 | w2v | kmeans | Homogeneity | 0.064443 |
| 94 | w2v | kmeans | Completeness | 0.060427 |
| 95 | w2v | kmeans | V-measure | 0.062371 |
96 rows × 4 columns
# Render each reported metric for the US corpus, in the same order as before.
for _metric in ("Silhouette", "Rand score", "Mutual information",
                "Homogeneity", "Completeness", "V-measure"):
    plot_metric(df_us_results, metrica=_metric, lang="US")
# Load the pickled distance-matrix reports for the US corpus.
us_dm = []
for f in us_array_paths:
    if "dm" in f:
        # "<vectorizer>_<clustering>_us_dm.pickle" -> (vectorizer, clustering).
        se, cm = f.replace("_us_dm.pickle", "").split("_")
        # Context manager closes the handle (original open(...) leaked it).
        with open(root_file + "/" + f, "rb") as fh:
            dict_m = pickle.load(fh)
        us_dm.append([(se, cm), dict_m])
# 4x4 grid of distance-matrix heatmaps for US: columns = vectorizers,
# rows = clustering methods.
fig, ax = plt.subplots(4, 4, figsize=(8, 8))
q1 = 0.05  # lower color-scale quantile
q3 = 0.95  # upper color-scale quantile (the original had a dead `if j==2`
           # branch that assigned 0.95 on both arms; collapsed here)
for v0, v1 in us_dm:
    # Map (vectorizer, clusterer) names to grid coordinates.
    j = ["tfidf", "bow", "w2v", "bertweet"].index(v0[0])
    i = ["kmeans", "gm", "hc", "dbscan"].index(v0[1])
    dm = v1["matriz_distancias"]
    # Clip the color scale to the [q1, q3] quantiles so extreme distances
    # do not wash out the structure.
    vmin = np.quantile(dm.flatten(), q1)
    vmax = np.quantile(dm.flatten(), q3)
    ax[i, j].imshow(dm, cmap="jet", vmin=vmin, vmax=vmax)
    ax[i, j].set_yticks([])
    ax[i, j].set_xticks([])
    if i == 0:
        ax[i, j].set_title(v0[0], size=18)
    if j == 0:
        ax[i, j].set_ylabel(v0[1], size=18)
plt.suptitle("Matriz de distancias: US", size=20);
fig.savefig('MD_US.jpg', bbox_inches='tight')
us_prjt = {"pca": {}, "tsne": {}, "umap": {}}
for f in us_array_paths:
if "projection" in f:
se, rd = f.split("_")[:2]
reduced = pickle.load(open(root_file+"/"+f, "rb"))["reduced"]
us_prjt[rd][se] = reduced
us_clusters = {"kmeans": {}, "hc": {}, "gm": {}, "dbscan": {}}
for f in us_array_paths:
if "report" in f:
se, cm = f.replace("_us_report.pickle", "").split("_")
dict_m = pickle.load(open(root_file+"/"+f, "rb"))
us_clusters[cm][se] = dict_m["clusters"]
plot_sample_index_us = np.random.RandomState(0).choice(range(20000), 1000)
def plot_prj(prj):
    """Scatter the 2-D `prj` projection for every (clusterer, vectorizer)
    pair on a 4x4 grid, colored by cluster assignment (US corpus), and
    save the figure as '<PRJ>_US.jpg'.

    NOTE: this redefines the ES `plot_prj`; calls after this point use the
    US data.
    """
    point_alpha = 0.5
    point_size = 7
    fig, ax = plt.subplots(4, 4, figsize=(10, 10))
    for i, cm in enumerate(["kmeans", "gm", "hc", "dbscan"]):
        for j, se in enumerate(["tfidf", "bow", "w2v", "bertweet"]):
            clusters = us_clusters[cm][se][plot_sample_index_us]
            coords = us_prjt[prj][se]
            x = coords[plot_sample_index_us, 0]
            y = coords[plot_sample_index_us, 1]
            # Draw larger clusters first so smaller ones stay visible on top.
            sorted_clusters = pd.Series(clusters).value_counts().sort_values(ascending=False).index
            for cluster in sorted_clusters:
                mask = (clusters == cluster)
                ax[i, j].scatter(x[mask], y[mask], alpha=point_alpha, s=point_size)
            ax[i, j].set_yticks([])
            ax[i, j].set_xticks([])
            if i == 0:
                ax[i, j].set_title(se, size=18)
            if j == 0:
                ax[i, j].set_ylabel(cm, size=18)
    plt.suptitle(f"{prj.upper()}: US", size=20);
    fig.savefig(f'{prj.upper()}_US.jpg', bbox_inches='tight')
plot_prj("umap")
plot_prj("tsne")
plot_prj("pca")